*********************************************************************************
*** Assign surface water pollution to infant mortality sites 
*** Use water pollution data from 2004 to 2010 
*********************************************************************************

#delimit ;  set type double, permanentely ; 
clear ;  clear matrix ; clear mata ; 
set matsize 5000 ;  set maxvar 5000 ;   set more off;   set rmsg on ;  pause on;  

tempfile f1 f2 f3 f4 f5 ; 

*** set path here *** 

*** water quality in 2008-2010, take average to avoid the influence of outliers ;  

use $rawdata\Water_Pollution_1999_2010_clean1_26_2017.dta, clear ; 
ren 水质 waterquality ; 
keep waterquality ID year Longitud_GG Latitud_GG ;  
xtset ID year ; 
xtbalance, range (2004 2010) miss(waterquality) ;     // check consistency with the expansion funciton below 
ren ID waterID ;   
save $pathdata\waterpollution_panel.dta, replace ;    


*** assign a panel pollution level to each IMR sites ;   

use $pathdata\waterpollution_panel.dta, clear ; 
keep waterID Longitud_GG Latitud_GG ; 
duplicates drop waterID, force ;   
save `f2' ;  

use $path\IMR_sites.dta, clear ; 
cross using `f2' ;   
geodist latitude longitude Latitud_GG Longitud_GG, gen(x) ; 
gen dist=x ;   
label var dist "IMR sites to cloest water pollution monitor" ;  


replace dist=. if dist>100 ;     // drop all pairs that are too far from each other, 283 IMR sites left, 48 IMR sites cannot find pollution monitors within 100KM radius 

bysort code: egen mindist=min(dist) ;  
bysort code: gen ind=mindist<=25  ;      
drop if ind==1 & dist>mindist  ;       // use the nearest pollution monitor point if there is one within 25KM radius  

recode dist (0=1)  ;    // 重合点的weight=1    
bysort code: egen deno_wgh=sum(1/dist) ;   
gen weights=(1/dist)/deno_wgh ;     // if no water pollution sites within 25km radius, use weighted avearge within 100KM radius      


keep code jcdmc waterID weights dist ;        
sort code dist ;       
drop if weights == . ;       
drop dist ;    
expand 7, gen(year) ;  
save $pathdata\water_IMR_rules.dta, replace ;       
clear ; 
set obs 7 ; 
gen year=2003+_n ; 
cross using $pathdata\water_IMR_rules.dta ;    
save $pathdata\water_IMR_rules.dta, replace ;       


use $pathdata\water_IMR_rules.dta, clear ;  		 
merge m:1 waterID year using $pathdata\waterpollution_panel.dta ;  
keep if _merge == 3  ;  

collapse  (mean) waterquality [w=weights], by(code year) ; 
ren waterquality waterquality_yearly ;   
label var waterquality_year "Yearly water quality" ; 
mdesc ; 

merge m:1 code using $pathdata\IMR_water_pollution_match.dta ;    
replace year=2004 if year==. ;   
tsset code year ; 
tsfill, full ; 
bysort code: egen x=mean(waterquality) ;     
replace waterquality = x ; 
replace waterquality=. if year<2008 ;      
drop _merge x ; 
save $pathdata\IMR_water_pollution_match.dta, replace ;  









